Análisis de https://www.nature.com/articles/srep00196.pdf
Podemos usar read_lines_chunked si el archivo original es grande. En este ejemplo, filtramos las recetas East Asian:
library(tidyverse)
limpiar <- function(lineas,...){
str_split(lineas, ',') %>%
keep(~.x[1] == 'EastAsian') %>%
map(~.x[-1]) %>% # quitar tipo de cocina
map(~.x[nchar(.x) > 0]) # quitar elementos vacios
}
callback_limpiar <- ListCallback$new(limpiar)
filtrado <- read_lines_chunked('../datos/recetas/srep00196-s3.csv',
skip = 1, callback = callback_limpiar, chunk_size = 1000)
recetas <- filtrado %>% flatten
recetas[1:10]
[[1]]
[1] "beef_broth" "egg" "soy_sauce" "soybean"
[[2]]
[1] "sesame_oil" "beef" "roasted_sesame_seed" "matsutake"
[5] "black_pepper" "scallion" "garlic" "soy_sauce"
[[3]]
[1] "vinegar" "roasted_sesame_seed" "cayenne" "scallion"
[5] "garlic" "soybean" "cucumber" "rice"
[[4]]
[1] "beef" "roasted_sesame_seed" "soy_sauce" "cayenne"
[5] "ginger" "scallion" "lettuce" "garlic"
[9] "vegetable" "sake"
[[5]]
[1] "garlic" "fish" "cayenne" "soy_sauce" "potato"
[[6]]
[1] "sweet_potato" "onion" "roasted_sesame_seed" "soy_sauce"
[5] "cayenne" "ginger" "soybean" "vegetable"
[9] "cabbage" "rice" "chicken" "sesame_oil"
[[7]]
[1] "sesame_oil" "radish" "fish" "black_pepper"
[5] "ginger" "garlic" "seaweed" "shrimp"
[9] "beef" "roasted_sesame_seed" "soy_sauce" "cayenne"
[13] "chinese_cabbage" "scallion" "sesame_seed" "rice"
[[8]]
[1] "vinegar" "radish" "fish" "cayenne" "scallion" "cucumber" "soybean"
[8] "vegetable" "garlic" "rice" "soy_sauce"
[[9]]
[1] "radish" "fish" "cayenne" "ginger" "scallion"
[6] "garlic" "vegetable_oil" "soy_sauce"
[[10]]
[1] "nut" "cucumber" "sesame_seed" "soybean"
library(arules)
length(recetas)
[1] 2512
## No hacer mucho más chico que este soporte, pues tenemos relativamente
## pocas transacciones:
pars <- list(support = 0.05, target = 'frequent itemsets',
ext = TRUE)
ap_recetas <- apriori(recetas, parameter = pars)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 125
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[242 item(s), 2512 transaction(s)] done [0.00s].
sorting and recoding items ... [41 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6 done [0.00s].
sorting transactions ... done [0.00s].
writing ... [628 set(s)] done [0.00s].
creating S4 object ... done [0.00s].
length(ap_recetas)
[1] 628
Vemos los items frecuentes
frecs <- ap_recetas %>% subset(size(.) == 1 ) %>% sort(by = 'support') %>%
DATAFRAME
DT::datatable(frecs %>% mutate_if(is.numeric, function(x) round(x, 3)))
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Y ahora examinamos combinaciones frecuentes de distintos tamaños
ap_recetas %>%
subset(size(.) == 2) %>%
subset(support > 0.20) %>%
sort(by = 'support') %>%
inspect
Incluso hay algunas combinaciones de 4 ingredientes que ocurren con frecuencia alta: estos ingredientes son bases de salsas, combinaciones de condimentos:
ap_recetas %>%
subset(size(.) == 4) %>%
subset(support > 0.10) %>%
sort(by = 'support') %>%
inspect
pars <- list(support = 0.01, confidence = 0.10,
target = 'rules',
ext = TRUE)
reglas_recetas <- apriori(recetas, parameter = pars)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 25
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[242 item(s), 2512 transaction(s)] done [0.00s].
sorting and recoding items ... [88 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6 7 8 done [0.02s].
writing ... [50181 rule(s)] done [0.01s].
creating S4 object ... done [0.01s].
agregar_hyperlift <- function(reglas, trans){
quality(reglas) <- cbind(quality(reglas),
hyper_lift = interestMeasure(reglas, measure = "hyperLift",
transactions = trans))
reglas
}
reglas_recetas <- agregar_hyperlift(reglas_recetas, recetas)
library(arulesViz)
Loading required package: grid
Registered S3 method overwritten by 'data.table':
method from
print.data.table
reglas_1 <- subset(reglas_recetas, hyper_lift > 1.1 & support > 0.1 & confidence > 0.40)
length(reglas_1)
[1] 213
reglas_tam_2 <- subset(reglas_1, size(reglas_1)==2)
#inspect(reglas_tam_2 %>% sort(by = 'hyper_lift'))
plot(reglas_1 %>% subset(support > 0.2), engine = "plotly")
To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
`arrange_()` is deprecated as of dplyr 0.7.0.
Please use `arrange()` instead.
See vignette('programming') for more help
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
library(tidygraph)
Attaching package: ‘tidygraph’
The following object is masked from ‘package:stats’:
filter
library(ggraph)
frecs <-
df_reglas <- reglas_tam_2 %>% DATAFRAME %>% rename(from=LHS, to=RHS) %>% data.frame
df_reglas$weight <- log(df_reglas$lift)
graph_1 <- as_tbl_graph(df_reglas) %>%
mutate(centrality = centrality_degree(mode = "all"))
set.seed(881)
ggraph(graph_1, layout = 'fr') +
geom_edge_link(aes(alpha=lift),
colour = 'red',
arrow = arrow(length = unit(4, 'mm'))) +
geom_node_point(aes(size = centrality, colour = centrality)) +
geom_node_text(aes(label = name), size=4,
colour = 'gray20', repel=TRUE) +
theme_graph(base_family = "sans")
reglas_1 <- subset(reglas_recetas, hyper_lift > 1.5 & confidence > 0.1)
length(reglas_1)
[1] 11068
reglas_tam_2 <- subset(reglas_1, size(reglas_1)==2)
length(reglas_tam_2)
[1] 132
library(tidygraph)
library(ggraph)
df_reglas <- reglas_tam_2 %>% DATAFRAME %>% rename(from=LHS, to=RHS) %>% as_data_frame
`as_data_frame()` is deprecated as of tibble 2.0.0.
Please use `as_tibble()` instead.
The signature and semantics have changed, see `?as_tibble`.
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
df_reglas$weight <- log(df_reglas$hyper_lift)
graph_1 <- as_tbl_graph(df_reglas) %>%
mutate(centrality = centrality_degree(mode = "all"))
ggraph(graph_1, layout = 'fr', start.temp=100) +
geom_edge_link(aes(alpha=lift),
colour = 'red',
arrow = arrow(length = unit(4, 'mm'))) +
geom_node_point(aes(size = centrality, colour = centrality)) +
geom_node_text(aes(label = name), size=4,
colour = 'gray20', repel=TRUE) +
theme_graph(base_family = "sans")
Exportamos para examinar en Gephi:
write_csv(df_reglas %>% rename(source=from, target=to) %>%
select(-count),
path='reglas.csv')
The `path` argument of `write_csv()` is deprecated as of readr 1.4.0.
Please use the `file` argument instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
La combinación corn y starch puede deberse en parte a una separación incorrecta en el procesamiento de los datos (corn starch o maizena convertido en dos ingredientes, corn y starch):
df_reglas %>% filter(from == "{corn}", to == "{starch}")
La confianza es considerablemente alta, aunque tenemos pocos datos de esta combinación. Podemos examinar algunos ejemplos:
recetas %>% keep(~ "tomato" %in% .x & "corn" %in% .x) %>% head(10)
[[1]]
[1] "tomato" "vinegar" "pork" "celery_oil" "leek"
[6] "corn" "black_pepper" "pepper" "ginger" "pea"
[11] "garlic" "soybean" "soy_sauce" "chicken_broth" "wine"
[[2]]
[1] "tomato" "vinegar" "pepper" "celery_oil" "corn" "cayenne"
[7] "pork" "garlic" "soybean" "vegetable" "coriander" "rice"
[13] "soy_sauce"
[[3]]
[1] "tomato" "vinegar" "pork" "celery_oil" "soy_sauce" "ginger"
[7] "garlic" "sherry" "corn"
[[4]]
[1] "pepper" "celery_oil" "starch" "corn" "ginger"
[6] "garlic" "soybean" "tomato" "vinegar" "beef"
[11] "soy_sauce" "cayenne" "scallion" "bell_pepper" "vegetable_oil"
[16] "rice" "wine"
[[5]]
[1] "tomato" "vinegar" "pork" "celery_oil" "beef" "soy_sauce"
[7] "ginger" "garlic" "corn" "wine"
[[6]]
[1] "tomato" "vinegar" "pepper" "lemon_juice" "celery_oil" "sake"
[7] "corn" "pork" "ginger" "honey" "garlic" "soybean"
[13] "rice" "soy_sauce"
[[7]]
[1] "tomato" "garlic" "onion" "bacon" "corn" "cayenne" "egg"
[[8]]
[1] "pork" "green_bell_pepper" "celery_oil" "starch"
[5] "corn" "garlic" "tomato" "vinegar"
[9] "onion" "soy_sauce" "cider" "scallion"
[13] "celery" "pineapple" "vegetable_oil" "egg"
[[9]]
[1] "tomato" "vinegar" "pepper" "celery_oil" "roasted_pork"
[6] "soy_sauce" "ginger" "honey" "garlic" "cinnamon"
[11] "soybean" "sherry" "corn" "oyster"
[[10]]
[1] "cane_molasses" "tomato" "pork" "celery_oil" "vinegar"
[6] "soy_sauce" "pepper" "ginger" "garlic" "corn"